This notebook aims to reproduce the findings of Tian et al. (2011), "Genome-wide association study of leaf architecture in the maize nested association mapping population".
use_gpu_num = 1
import os
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
device = "cuda" if torch.cuda.is_available() else "cpu"
if use_gpu_num in [0, 1]:
torch.cuda.set_device(use_gpu_num)
print(f"Using {device} device")
from tqdm import tqdm
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"
import dlgwas
from dlgwas.kegg import ensure_dir_path_exists
from dlgwas.kegg import get_cached_result
from dlgwas.kegg import put_cached_result
from dlgwas.dlfn import calc_cs
from dlgwas.dlfn import apply_cs
from dlgwas.dlfn import reverse_cs
Using cuda device
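The `calc_cs`/`apply_cs`/`reverse_cs` helpers from `dlgwas.dlfn` are used below to center and scale the phenotypes with training-set statistics. For readers without `dlgwas`, a minimal sketch of the assumed behavior (the real signatures and return structure may differ):
import numpy as np  # already imported above; repeated so the sketch is self-contained

def calc_cs_sketch(x):
    # "fit": record the mean and sd of the training values
    return {'mean': np.nanmean(x), 'sd': np.nanstd(x)}

def apply_cs_sketch(x, cs):
    # "transform": z-score with the recorded statistics
    return (x - cs['mean']) / cs['sd']

def reverse_cs_sketch(x, cs):
    # inverse transform: map back to the original trait units
    return x * cs['sd'] + cs['mean']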
# set up directory for notebook artifacts
nb_name = '11_TianEtAl2011'
# import shutil
# # to rerun from scratch:
# shutil.rmtree('../models/'+nb_name, ignore_errors=True)
# shutil.rmtree('../reports/'+nb_name, ignore_errors=True)
ensure_dir_path_exists(dir_path = '../models/'+nb_name)
ensure_dir_path_exists(dir_path = '../reports/'+nb_name)
# Read in cleaned data
taxa_groupings = pd.read_csv('../models/10_TianEtAl2011/taxa_groupings.csv')
data = pd.read_csv('../models/10_TianEtAl2011/clean_data.csv')
# Define holdout sets (Populations)
uniq_pop = list(set(taxa_groupings['Population']))  # note: set iteration order is not stable across sessions
print(str(len(uniq_pop))+" Unique Holdout Groups.")
taxa_groupings['Holdout'] = None
for i in range(len(uniq_pop)):
    mask = (taxa_groupings['Population'] == uniq_pop[i])
    taxa_groupings.loc[mask, 'Holdout'] = i
taxa_groupings
25 Unique Holdout Groups.
| | Unnamed: 0 | sample | Population | Holdout |
|---|---|---|---|---|
| 0 | 0 | Z001E0001 | B73 x B97 | 18 |
| 1 | 1 | Z001E0002 | B73 x B97 | 18 |
| 2 | 2 | Z001E0003 | B73 x B97 | 18 |
| 3 | 3 | Z001E0004 | B73 x B97 | 18 |
| 4 | 4 | Z001E0005 | B73 x B97 | 18 |
| ... | ... | ... | ... | ... |
| 4671 | 4671 | Z026E0196 | B73 x Tzi8 | 4 |
| 4672 | 4672 | Z026E0197 | B73 x Tzi8 | 4 |
| 4673 | 4673 | Z026E0198 | B73 x Tzi8 | 4 |
| 4674 | 4674 | Z026E0199 | B73 x Tzi8 | 4 |
| 4675 | 4675 | Z026E0200 | B73 x Tzi8 | 4 |
4676 rows × 4 columns
# Randomly choose a population to hold out, unless a cached choice already exists.
# Holdout_Int = 0
Holdout_Int_path = '../models/'+nb_name+'/holdout_pop_int.pkl'
if get_cached_result(Holdout_Int_path) is not None:
    Holdout_Int = get_cached_result(Holdout_Int_path)
else:
    Holdout_Int = int(np.random.choice(len(uniq_pop)))
    put_cached_result(Holdout_Int_path, Holdout_Int)
print("Holding out i="+str(Holdout_Int)+": "+uniq_pop[Holdout_Int])
mask = (taxa_groupings['Holdout'] == Holdout_Int)
train_idxs = list(taxa_groupings.loc[~mask, ].index)
test_idxs = list(taxa_groupings.loc[mask, ].index)
Holding out i=14: B73 x Il14H
[len(e) for e in [test_idxs, train_idxs]]
[194, 4482]
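Only a single population is held out below; as a sketch, the same masking extends to full leave-one-population-out cross-validation (per-fold fitting and evaluation omitted):
# Sketch: iterate every Holdout integer defined above and rebuild the split.
for h in sorted(taxa_groupings['Holdout'].unique()):
    fold_mask = (taxa_groupings['Holdout'] == h)
    fold_train_idxs = list(taxa_groupings.loc[~fold_mask, ].index)
    fold_test_idxs = list(taxa_groupings.loc[fold_mask, ].index)
    print(uniq_pop[h], len(fold_train_idxs), len(fold_test_idxs))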
downsample_obs = True
train_n = 900
test_n = 100
if downsample_obs:
    # sample without replacement so no observation is drawn twice
    train_idxs = np.random.choice(train_idxs, train_n, replace=False)
    test_idxs = np.random.choice(test_idxs, test_n, replace=False)
print([len(e) for e in [test_idxs, train_idxs]])
[100, 900]
# Maps a position in the y tensors back to its row index in `data`, so the matching xs file can be loaded from disk.
idx_original = np.array(data.index)
y1 = np.array(data['leaf_length'])
y2 = np.array(data['leaf_width'])
y3 = np.array(data['upper_leaf_angle'])
scale_dict = {
    'y1': calc_cs(y1[train_idxs]),
    'y2': calc_cs(y2[train_idxs]),
    'y3': calc_cs(y3[train_idxs])
}
y1 = apply_cs(y1, scale_dict['y1'])
y2 = apply_cs(y2, scale_dict['y2'])
y3 = apply_cs(y3, scale_dict['y3'])
# Loading all of the marker data (xs) into memory crashes the session, so only the y vectors are held in RAM; xs are read per sample in the Dataset below.
y1_train = torch.from_numpy(y1[train_idxs])[:, None]
y2_train = torch.from_numpy(y2[train_idxs])[:, None]
y3_train = torch.from_numpy(y3[train_idxs])[:, None]
idx_original_train = torch.from_numpy(idx_original[train_idxs])
y1_test = torch.from_numpy(y1[test_idxs])[:, None]
y2_test = torch.from_numpy(y2[test_idxs])[:, None]
y3_test = torch.from_numpy(y3[test_idxs])[:, None]
idx_original_test = torch.from_numpy(idx_original[test_idxs])
class CustomDataset(Dataset):
    # Note: this block runs once, at class-definition time (hence the print below).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if use_gpu_num in [0, 1]:
        torch.cuda.set_device(use_gpu_num)
    print(f"Using {device} device")

    def __init__(self, y1, y2, y3, #xs,
                 idx_original,
                 transform = None, target_transform = None):
        self.y1 = y1
        self.y2 = y2
        self.y3 = y3
        # self.xs = xs
        self.idx_original = idx_original
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.y1)

    def __getitem__(self, idx):
        y1_idx = self.y1[idx].to(device).float()
        y2_idx = self.y2[idx].to(device).float()
        y3_idx = self.y3[idx].to(device).float()
        # Change type of xs loaded !! ----------------------------------------
        # xs are loaded from disk as they are needed.
        # Non-Hilbert Version
        save_path = '../models/10_TianEtAl2011/markers/'
        # Hilbert version
        # save_path = '../models/'+nb_name+'/hilbert/'
        save_file_path = save_path+'m'+str(int(self.idx_original[idx]))+'.npz'
        xs_idx = np.load(save_file_path)['arr_0']
        xs_idx = torch.from_numpy(xs_idx).to(device).float()
        xs_idx = xs_idx.squeeze()
        if self.transform:
            xs_idx = self.transform(xs_idx)
        if self.target_transform:
            y1_idx = self.target_transform(y1_idx)  # fixed: was self.transform
            y2_idx = self.target_transform(y2_idx)
            y3_idx = self.target_transform(y3_idx)
        return xs_idx, y1_idx, y2_idx, y3_idx
Using cuda device
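`CustomDataset` expects one compressed `.npz` file per sample under `markers/`, stored under NumPy's default `arr_0` key. A sketch of how compatible files could be written; `demo_genotypes` is a hypothetical stand-in for the real one-hot marker matrix, and `/tmp/` is used to avoid touching the real files:
# np.savez_compressed stores its positional argument under the key 'arr_0',
# which is exactly what __getitem__ above reads back.
demo_genotypes = np.zeros((2, 943455, 4), dtype=np.int8)  # (samples, markers, channels)
for i in range(demo_genotypes.shape[0]):
    np.savez_compressed('/tmp/m'+str(i)+'.npz', demo_genotypes[i])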
training_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_train,
        y2 = y2_train,
        y3 = y3_train,
        idx_original = idx_original_train
    ),
    batch_size = 64,
    shuffle = True)
testing_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_test,
        y2 = y2_test,
        y3 = y3_test,
        idx_original = idx_original_test  # fixed: was idx_original_train, which loaded the wrong samples' markers
    ),
    batch_size = 64,
    shuffle = True)
xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# del training_dataloader
# torch.cuda.empty_cache()
xs_i.shape
torch.Size([64, 943455, 4])
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.x_network = nn.Sequential(
            nn.Flatten(),
            # 943455 markers x 4 channels = 3,773,820 flattened features
            nn.Linear(3773820, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        x_out = self.x_network(x)
        return x_out
model = NeuralNetwork().to(device)
# print(model)
model(xs_i)[0:3] # try prediction on one batch
tensor([[0.8475],
[0.8549],
[0.7640]], device='cuda:1', grad_fn=<SliceBackward0>)
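The first `Linear` layer dominates the parameter count: 3,773,820 × 64 + 64 ≈ 241.5M weights, roughly 0.97 GB in float32. A quick check before scaling the model up:
# Count trainable parameters; the first Linear layer accounts for nearly all of them.
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{n_params:,} trainable parameters (~{4 * n_params / 1e9:.2f} GB as float32)")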
def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
    size = len(dataloader.dataset)
    for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(xs_i)
        loss = loss_fn(pred, y1_i) # <----------------------------------------
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(y1_i) # <----------------
            if not silent:
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def train_error(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss = 0
    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            train_loss += loss_fn(pred, y1_i).item() # <----------------------
    train_loss /= num_batches
    return train_loss
def test_loop(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0
    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            test_loss += loss_fn(pred, y1_i).item() # <-----------------------
    test_loss /= num_batches
    if not silent:
        print(f"Test Error: Avg loss: {test_loss:>8f}")
    return test_loss
def train_nn(
    training_dataloader,
    testing_dataloader,
    model,
    learning_rate = 1e-3,
    batch_size = 64,  # batch size is set on the DataLoaders; kept here for the record
    epochs = 500
):
    # Initialize the loss function
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
    loss_df['TrainMSE'] = np.nan
    loss_df['TestMSE'] = np.nan
    for t in tqdm(range(epochs)):
        # print(f"Epoch {t+1}\n-------------------------------")
        train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)
        loss_df.loc[loss_df.index == t, 'TrainMSE'] = train_error(training_dataloader, model, loss_fn, silent = True)
        loss_df.loc[loss_df.index == t, 'TestMSE'] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        if (t+1) % 10 == 0:  # cache every 10 epochs in case training is interrupted (fixed: was `if (t+1)%10:`)
            # print(loss_df.loc[loss_df.index == t, ['TrainMSE', 'TestMSE']])
            torch.save(model.state_dict(),
                       '../models/'+nb_name+'/model_'+str(t)+'_'+str(epochs)+'.pt') # convention is to use .pt or .pth
    return model, loss_df
# Don't rerun if the saved model exists; there may be cases where we want the results but not the model.
if not os.path.exists('../models/'+nb_name+'/model.pt'):
    model, loss_df = train_nn(
        training_dataloader,
        testing_dataloader,
        model,
        learning_rate = 1e-3,
        batch_size = 64,
        epochs = 20
    )
    # experimental outputs:
    # 1. Model
    torch.save(model.state_dict(), '../models/'+nb_name+'/model.pt') # convention is to use .pt or .pth
    # 2. loss_df
    loss_df.to_csv('../reports/'+nb_name+'/loss_df.csv', index=False)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [19:04<00:00, 57.23s/it]
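Because `train_nn` writes periodic checkpoints, an interrupted run can be resumed from the newest one. A sketch, assuming the `model_<epoch>_<total>.pt` naming used above:
# Reload the most recently written epoch checkpoint, if any exist.
import glob
ckpts = glob.glob('../models/'+nb_name+'/model_*_*.pt')
if ckpts:
    latest = max(ckpts, key=os.path.getmtime)
    model.load_state_dict(torch.load(latest))
    print('Resumed from', latest)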
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 1
# )
# # don't run if either of these exist because there may be cases where we want the results but not the model
# #| os.path.exists('../reports/'+nb_name+'/loss_df.csv')
# if not os.path.exists('../models/'+nb_name+'/model.pt'):
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 250
# )
# # experimental outputs:
# # 1. Model
# torch.save(model.state_dict(), '../models/'+nb_name+'/model.pt') # convention is to use .pt or .pth
# # 2. loss_df
# loss_df.to_csv('../reports/'+nb_name+'/loss_df.csv', index=False)
# This run uses 6650/8192 MiB of available GPU memory.
# A larger batch size could make use of more of it.
# Increasing the size of the model will also require more memory, so usage should be monitored.
# One epoch takes about 268 s: 1/1 [04:28<00:00, 268.24s/it]
# NeuralNetwork(
# (x_network): Sequential(
# (0): Flatten(start_dim=1, end_dim=-1)
# (1): Linear(in_features=3773820, out_features=64, bias=True)
# (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
# (3): ReLU()
# (4): Linear(in_features=64, out_features=1, bias=True)
# )
# )
# 45/500 [3:22:20<34:19:22, 271.57s/it]
loss_df = pd.read_csv('../reports/'+nb_name+'/loss_df.csv')
fig = go.Figure()
fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
mode='lines', name='Test'))
fig.show()
fig = go.Figure()
fig.add_trace(go.Scatter(x=loss_df.Epoch,
y= reverse_cs(loss_df.TrainMSE,
scale_dict['y1']),
mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=loss_df.Epoch,
y= reverse_cs(loss_df.TestMSE,
scale_dict['y1']),
mode='lines', name='Test'))
fig.show()
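A caveat on the plot above: `reverse_cs` un-scales a value (multiply by the training sd, add back the mean), which is right for predictions but not for a squared error. An MSE in z-score units converts to original trait units by sd², and an RMSE by sd. A sketch, assuming `calc_cs` exposes the training sd (the `'sd'` key is an assumption about its return value):
# Hypothetical: convert the z-scored test MSE into leaf_length units via RMSE.
sd_y1 = scale_dict['y1']['sd']  # key name is an assumption
rmse_leaf_length = np.sqrt(loss_df.TestMSE) * sd_y1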
# # ! conda install captum -c pytorch -y
# # imports from captum library
# from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
# from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation
# ig = IntegratedGradients(model)
# ig_nt = NoiseTunnel(ig)
# dl = DeepLift(model)
# gs = GradientShap(model)
# fa = FeatureAblation(model)
# ig_attr_test = ig.attribute(xs_test, n_steps=50)
# ig_nt_attr_test = ig_nt.attribute(xs_test)
# dl_attr_test = dl.attribute(xs_test)
# gs_attr_test = gs.attribute(xs_test, xs_train)
# fa_attr_test = fa.attribute(xs_test)
# [e.shape for e in [ig_attr_test,
# ig_nt_attr_test,
# dl_attr_test,
# gs_attr_test,
# fa_attr_test]]
# fig = go.Figure()
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = ig_nt_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = dl_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = gs_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = fa_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.show()
# len(dl_attr_test.cpu().detach().numpy().mean(axis = 0))
# ## Version 2, Predict `y1` (Anthesis), `y2` (Silking), and `y3` (ASI)
# Here each model will predict 3 values. The loss function is still mse, but the y tensors are concatenated
# class NeuralNetwork(nn.Module):
# def __init__(self):
# super(NeuralNetwork, self).__init__()
# self.x_network = nn.Sequential(
# nn.Linear(1106, 64),
# nn.BatchNorm1d(64),
# nn.ReLU(),
# nn.Linear(64, 3))
# def forward(self, x):
# x_out = self.x_network(x)
# return x_out
# model = NeuralNetwork().to(device)
# # print(model)
# xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# model(xs_i).shape # try prediction on one batch
# def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
# size = len(dataloader.dataset)
# for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
# # Compute prediction and loss
# pred = model(xs_i)
# loss = loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)) # <----------------------------------------
# # Backpropagation
# optimizer.zero_grad()
# loss.backward()
# optimizer.step()
# if batch % 100 == 0:
# loss, current = loss.item(), batch * len(y1_i) # <----------------
# if not silent:
# print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
# def train_error(dataloader, model, loss_fn, silent = False):
# size = len(dataloader.dataset)
# num_batches = len(dataloader)
# train_loss = 0
# with torch.no_grad():
# for xs_i, y1_i, y2_i, y3_i in dataloader:
# pred = model(xs_i)
# train_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <----------------------
# train_loss /= num_batches
# return(train_loss)
# def test_loop(dataloader, model, loss_fn, silent = False):
# size = len(dataloader.dataset)
# num_batches = len(dataloader)
# test_loss = 0
# with torch.no_grad():
# for xs_i, y1_i, y2_i, y3_i in dataloader:
# pred = model(xs_i)
# test_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <-----------------------
# test_loss /= num_batches
# if not silent:
# print(f"Test Error: Avg loss: {test_loss:>8f}")
# return(test_loss)
# def train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# ):
# # Initialize the loss function
# loss_fn = nn.MSELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
# loss_df['TrainMSE'] = np.nan
# loss_df['TestMSE'] = np.nan
# for t in tqdm.tqdm(range(epochs)):
# # print(f"Epoch {t+1}\n-------------------------------")
# train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)
# loss_df.loc[loss_df.index == t, 'TrainMSE'
# ] = train_error(training_dataloader, model, loss_fn, silent = True)
# loss_df.loc[loss_df.index == t, 'TestMSE'
# ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
# return([model, loss_df])
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# )
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
# mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
# mode='lines', name='Test'))
# fig.show()
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 5000
# )
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
# mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
# mode='lines', name='Test'))
# fig.show()
# '../ext_data/zma/panzea/phenotypes/'
# # pd.read_table('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212.txt', low_memory = False)
# # pd.read_excel('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212_TraitDescritptions.xlsx')
# data = pd.read_table('../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/markergenotypes062508.txt', skiprows=1
# ).reset_index().rename(columns = {'index': 'Geno_Code'})
# data
# px.scatter_matrix(data.loc[:, ['days2anthesis', 'days2silk', 'asi']])
# d2a = np.array(data['days2anthesis'])
# d2s = np.array(data['days2silk'])
# asi = np.array(data['asi'])
# xs = np.array(data.drop(columns = ['days2anthesis', 'days2silk', 'asi', 'pop', 'Geno_Code']))
# n_obs = xs.shape[0]
# np_seed = 9070707
# rng = np.random.default_rng(np_seed) # can be called without a seed
# test_pr = 0.2
# test_n = round(n_obs*test_pr)
# idxs = np.linspace(0, n_obs-1, num = n_obs).astype(int)
# rng.shuffle(idxs)
# test_idxs = idxs[0:test_n]
# train_idxs = idxs[test_n:-1]
# y1_train = torch.from_numpy(y1[train_idxs]).to(device).float()[:, None]
# y2_train = torch.from_numpy(y2[train_idxs]).to(device).float()[:, None]
# y3_train = torch.from_numpy(y3[train_idxs]).to(device).float()[:, None]
# xs_train = torch.from_numpy(xs[train_idxs]).to(device).float()
# y1_test = torch.from_numpy(y1[test_idxs]).to(device).float()[:, None]
# y2_test = torch.from_numpy(y2[test_idxs]).to(device).float()[:, None]
# y3_test = torch.from_numpy(y3[test_idxs]).to(device).float()[:, None]
# xs_test = torch.from_numpy(xs[test_idxs]).to(device).float()
# class CustomDataset(Dataset):
# def __init__(self, y1, y2, y3, xs, transform = None, target_transform = None):
# self.y1 = y1
# self.y2 = y2
# self.y3 = y3
# self.xs = xs
# self.transform = transform
# self.target_transform = target_transform
# def __len__(self):
# return len(self.y1)
# def __getitem__(self, idx):
# y1_idx = self.y1[idx]
# y2_idx = self.y2[idx]
# y3_idx = self.y3[idx]
# xs_idx = self.xs[idx]
# if self.transform:
# xs_idx = self.transform(xs_idx)
# if self.target_transform:
# y1_idx = self.transform(y1_idx)
# y2_idx = self.transform(y2_idx)
# y3_idx = self.transform(y3_idx)
# return xs_idx, y1_idx, y2_idx, y3_idx
# training_dataloader = DataLoader(
# CustomDataset(
# y1 = y1_train,
# y2 = y2_train,
# y3 = y3_train,
# xs = xs_train
# ),
# batch_size = 64,
# shuffle = True)
# testing_dataloader = DataLoader(
# CustomDataset(
# y1 = y1_test,
# y2 = y2_test,
# y3 = y3_test,
# xs = xs_test
# ),
# batch_size = 64,
# shuffle = True)
# xs.shape
# ## y1 (Anthesis)
# class NeuralNetwork(nn.Module):
# def __init__(self):
# super(NeuralNetwork, self).__init__()
# self.x_network = nn.Sequential(
# nn.Linear(1106, 64),
# nn.BatchNorm1d(64),
# nn.ReLU(),
# nn.Linear(64, 1))
# def forward(self, x):
# x_out = self.x_network(x)
# return x_out
# model = NeuralNetwork().to(device)
# # print(model)
# xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# model(xs_i).shape # try prediction on one batch
# def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
# size = len(dataloader.dataset)
# for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
# # Compute prediction and loss
# pred = model(xs_i)
# loss = loss_fn(pred, y1_i) # <----------------------------------------
# # Backpropagation
# optimizer.zero_grad()
# loss.backward()
# optimizer.step()
# if batch % 100 == 0:
# loss, current = loss.item(), batch * len(y1_i) # <----------------
# if not silent:
# print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
# def train_error(dataloader, model, loss_fn, silent = False):
# size = len(dataloader.dataset)
# num_batches = len(dataloader)
# train_loss = 0
# with torch.no_grad():
# for xs_i, y1_i, y2_i, y3_i in dataloader:
# pred = model(xs_i)
# train_loss += loss_fn(pred, y1_i).item() # <----------------------
# train_loss /= num_batches
# return(train_loss)
# def test_loop(dataloader, model, loss_fn, silent = False):
# size = len(dataloader.dataset)
# num_batches = len(dataloader)
# test_loss = 0
# with torch.no_grad():
# for xs_i, y1_i, y2_i, y3_i in dataloader:
# pred = model(xs_i)
# test_loss += loss_fn(pred, y1_i).item() # <-----------------------
# test_loss /= num_batches
# if not silent:
# print(f"Test Error: Avg loss: {test_loss:>8f}")
# return(test_loss)
# def train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# ):
# # Initialize the loss function
# loss_fn = nn.MSELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
# loss_df['TrainMSE'] = np.nan
# loss_df['TestMSE'] = np.nan
# for t in tqdm.tqdm(range(epochs)):
# # print(f"Epoch {t+1}\n-------------------------------")
# train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)
# loss_df.loc[loss_df.index == t, 'TrainMSE'
# ] = train_error(training_dataloader, model, loss_fn, silent = True)
# loss_df.loc[loss_df.index == t, 'TestMSE'
# ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
# return([model, loss_df])
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# )
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
# mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
# mode='lines', name='Test'))
# fig.show()